In this vignette for the MiniChip package we will generate a GRanges object of ChIP-seq peaks for Adnp, as well as a matched Granges object of randomized peaks across the mouse genome. Then we will plot the distribution of Adnp read counts (per Million reads) across the peak summits.
suppressPackageStartupMessages({
library(MiniChip)
library(ComplexHeatmap)
library(GenomicFeatures)
library(ggplot2)
library(reshape2)
library(BSgenome.Mmusculus.UCSC.mm10)
#library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(TxDb.Mmusculus.UCSC.mm10.ensGene)
library(EnsDb.Mmusculus.v79)
})First we load the narrowPeak output files generated by MACS peak finding for all 3 Adnp ChIP replicates, name the culumns in a meaningful way, and turn them into GRanges objects.
peaks1.d <- read.table(system.file("extdata", "Adnp_rep1_chr11_peaks.narrowPeak", package = "MiniChip"),header=FALSE)
peaks2.d <- read.table(system.file("extdata", "Adnp_rep2_chr11_peaks.narrowPeak", package = "MiniChip"),header=FALSE)
peaks3.d <- read.table(system.file("extdata", "Adnp_rep3_chr11_peaks.narrowPeak", package = "MiniChip"),header=FALSE)
names(peaks1.d) <- c("chr","start","end","name","score","empty","foldchange","pvalue","qvalue","summit")
names(peaks2.d) <- c("chr","start","end","name","score","empty","foldchange","pvalue","qvalue","summit")
names(peaks3.d) <- c("chr","start","end","name","score","empty","foldchange","pvalue","qvalue","summit")
peaks1 <- makeGRangesFromDataFrame(peaks1.d,
keep.extra.columns=TRUE,
ignore.strand=TRUE,
seqinfo=NULL,
seqnames.field=c("chr"),
start.field=c("start"),
end.field=c("end"),
starts.in.df.are.0based=FALSE)
peaks2 <- makeGRangesFromDataFrame(peaks2.d,
keep.extra.columns=TRUE,
ignore.strand=TRUE,
seqinfo=NULL,
seqnames.field=c("chr"),
start.field=c("start"),
end.field=c("end"),
starts.in.df.are.0based=FALSE)
peaks3 <- makeGRangesFromDataFrame(peaks3.d,
keep.extra.columns=TRUE,
ignore.strand=TRUE,
seqinfo=NULL,
seqnames.field=c("chr"),
start.field=c("start"),
end.field=c("end"),
starts.in.df.are.0based=FALSE)Then we select only the peaks which occur in at least 2 of the 3 replicates using the GRanges subsetByOverlaps and overlapsAny functions.
# peaks that occur in at least 2/3 replicates
peaks3.1 <- subsetByOverlaps(peaks3,peaks1)
peaks3.2 <- subsetByOverlaps(peaks3,peaks2)
peaks1.2 <- subsetByOverlaps(peaks1,peaks2)
#keep all 3.1 peaks, find 3.2 and 1.2 peaks that do not overlap 3.1 peaks
# peaks which are found in 3 and 2, but not in 3 and 1, and not in 1 and 2
peaks3.2u <- peaks3.2[overlapsAny(peaks3.2,peaks3.1) ==FALSE & overlapsAny(peaks3.2,peaks1.2) == FALSE]
# peaks which are found in 1 and 2, but not in 3 and 1, and not in 3 and 2
peaks1.2u <- peaks1.2[overlapsAny(peaks1.2,peaks3.1) ==FALSE & overlapsAny(peaks1.2,peaks3.2) == FALSE]
# add together all 3.1 peaks as well as unique 3.2 and 1.2 peaks
peaks <- c(peaks3.1,peaks3.2u,peaks1.2u)We can now generate a peaknames object describing the peaks GRanges object we have just selected (for later use), and look at the peaks GRanges object to make sure it countains seqnames (chromosomes), start, end, strand, summit, and name filedds, as required by many functions in the MiniChip package.
peaknames <- "Adnp_peaks"
peaks
#> GRanges object with 1453 ranges and 7 metadata columns:
#> seqnames ranges strand | name score
#> <Rle> <IRanges> <Rle> | <character> <integer>
#> [1] chr11 3193356-3193462 * | Adnp_r3_CHIP_peak_93 19
#> [2] chr11 3197533-3197624 * | Adnp_r3_CHIP_peak_94 19
#> [3] chr11 6413061-6413136 * | Adnp_r3_CHIP_peak_95 32
#> [4] chr11 6413384-6413485 * | Adnp_r3_CHIP_peak_96 33
#> [5] chr11 48821708-48821764 * | Adnp_r3_CHIP_peak_97 32
#> ... ... ... ... . ... ...
#> [1449] chr11 121414173-121414488 * | Adnp_rep1_peak_6526 166
#> [1450] chr11 121457872-121458237 * | Adnp_rep1_peak_6527 221
#> [1451] chr11 121675897-121676260 * | Adnp_rep1_peak_6532 319
#> [1452] chr11 121884945-121885189 * | Adnp_rep1_peak_6534 86
#> [1453] chr11 121977957-121978411 * | Adnp_rep1_peak_6536 185
#> empty foldchange pvalue qvalue summit
#> <character> <numeric> <numeric> <numeric> <integer>
#> [1] . 1.95180 6.55661 1.90294 32
#> [2] . 2.28840 6.55834 1.90457 22
#> [3] . 5.99934 8.18814 3.20789 60
#> [4] . 5.10858 8.41215 3.35880 83
#> [5] . 6.06879 8.29235 3.29854 32
#> ... ... ... ... ... ...
#> [1449] . 7.10540 19.9389 16.69339 142
#> [1450] . 6.76789 25.5535 22.11681 169
#> [1451] . 10.33513 35.6853 31.95324 210
#> [1452] . 4.26805 11.5416 8.67331 178
#> [1453] . 7.53603 21.8940 18.57712 164
#> -------
#> seqinfo: 1 sequence from an unspecified genome; no seqlengthsThen we use the SimulatePeaks function from the MiniChip package to generate a GRanges object called random.peaks that contains the same number of peaks (of the same lengths) as the original peaks GRanges object, but ech one shuffled to a randomly chosen location in the mouse genome. This is a useful control object to have when comparing the overlap of ChIP peaks with genomic annotations, for example genes or repeat elements.
First, we select the bam files of mapped reads that we want to use from the gbuehler deepSeqRepos….
#select bam files from Adnp experiment
bamFiles <- list.files(system.file("extdata", package = "MiniChip"), full.names=TRUE,pattern="*bam$")
#all.bamFiles <- list.files("/tungstenfs/scratch/gbuehler/deepSeqRepos/bam/", full.names=TRUE,pattern="*bam$")
#all.bamFiles <- grep("test",all.bamFiles,value=TRUE,invert=TRUE)
#bamFiles <- grep("1950F",all.bamFiles,value=TRUE)[1:8]
bamFiles
#> [1] "/tungstenfs/scratch/gbuehler/bioinfo/Rpackages/MiniChip/inst/extdata/Adnp_wt_ChIP_r1_chr11.bam"
#> [2] "/tungstenfs/scratch/gbuehler/bioinfo/Rpackages/MiniChip/inst/extdata/Adnp_wt_ChIP_r2_chr11.bam"
#> [3] "/tungstenfs/scratch/gbuehler/bioinfo/Rpackages/MiniChip/inst/extdata/Adnp_wt_ChIP_r3_chr11.bam"
#> [4] "/tungstenfs/scratch/gbuehler/bioinfo/Rpackages/MiniChip/inst/extdata/Adnp_wt_Input_r1_chr11.bam"
#> [5] "/tungstenfs/scratch/gbuehler/bioinfo/Rpackages/MiniChip/inst/extdata/Adnp_wt_Input_r2_chr11.bam"
#> [6] "/tungstenfs/scratch/gbuehler/bioinfo/Rpackages/MiniChip/inst/extdata/Adnp_wt_Input_r3_chr11.bam"…and extract short but meaningful names for the bam files decribing what has been chipped.
bamNames <- gsub(paste(system.file("extdata", package = "MiniChip"),"/",sep=""),"",bamFiles)
bamNames <- gsub("_chr11.bam","",bamNames)
bamNames
#> [1] "Adnp_wt_ChIP_r1" "Adnp_wt_ChIP_r2" "Adnp_wt_ChIP_r3" "Adnp_wt_Input_r1"
#> [5] "Adnp_wt_Input_r2" "Adnp_wt_Input_r3"You can select any bamFile that you are interested in, and the number of bamFiles you can choose is unlimited.
To check if the enrichments are biased towards GC rich regions, we can use the GCbias function to plot GC content versus the read counts for each alignment file.
GCbias(bamFiles[1:2],bamNames[1:2],pe="none", restrict="chr11",GCprob=TRUE, genome=BSgenome.Mmusculus.UCSC.mm10)It looks like there is no GC bias in our selected bam files, so we can continue to calculate the heatmaps.
Now we define the span (distance from peak summit to left and right, in this case 3025bp), and step size for windows across the peaks (in this case 50bp). Furthermore we crate a summits GRanges object that contains the genomic coordinates of the peak summits as s trat and end positions.
Then we use the SummitHeatmap function for the MiniChip package to calculate the counts per million (cpm) reads of each bam file in each window across the summit of each peak in the peaks object.
span <- 3025
step <- 50
summits <- peaks
start(summits) <- start(summits) + summits$summit
end(summits) <- start(summits)
names(summits) <- elementMetadata(summits)[,"name"]
counts <- SummitHeatmap(summits,bamFiles,bamNames,span,step,useCPM=TRUE,readShiftSize=75)
#>
#> ========== _____ _ _ ____ _____ ______ _____
#> ===== / ____| | | | _ \| __ \| ____| /\ | __ \
#> ===== | (___ | | | | |_) | |__) | |__ / \ | | | |
#> ==== \___ \| | | | _ <| _ /| __| / /\ \ | | | |
#> ==== ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
#> ========== |_____/ \____/|____/|_| \_\______/_/ \_\_____/
#> Rsubread 2.2.2
#>
#> //========================== featureCounts setting ===========================\\
#> || ||
#> || Input files : 6 BAM files ||
#> || o Adnp_wt_ChIP_r1_chr11.bam ||
#> || o Adnp_wt_ChIP_r2_chr11.bam ||
#> || o Adnp_wt_ChIP_r3_chr11.bam ||
#> || o Adnp_wt_Input_r1_chr11.bam ||
#> || o Adnp_wt_Input_r2_chr11.bam ||
#> || o Adnp_wt_Input_r3_chr11.bam ||
#> || ||
#> || Annotation : R data.frame ||
#> || Dir for temp files : . ||
#> || Threads : 20 ||
#> || Level : feature level ||
#> || Paired-end : no ||
#> || Multimapping reads : not counted ||
#> || Multi-overlapping reads : counted ||
#> || Min overlapping bases : 1 ||
#> || Read shift : 75 to downstream ||
#> || Read reduction : to 5' end ||
#> || ||
#> \\============================================================================//
#>
#> //================================= Running ==================================\\
#> || ||
#> || Load annotation file .Rsubread_UserProvidedAnnotation_pid291812 ... ||
#> || Features : 175813 ||
#> || Meta-features : 1453 ||
#> || Chromosomes/contigs : 1 ||
#> || ||
#> || Process BAM file Adnp_wt_ChIP_r1_chr11.bam... ||
#> || Single-end reads are included. ||
#> || Total alignments : 1668024 ||
#> || Successfully assigned alignments : 245516 (14.7%) ||
#> || Running time : 0.01 minutes ||
#> || ||
#> || Process BAM file Adnp_wt_ChIP_r2_chr11.bam... ||
#> || Single-end reads are included. ||
#> || Total alignments : 1466194 ||
#> || Successfully assigned alignments : 193541 (13.2%) ||
#> || Running time : 0.01 minutes ||
#> || ||
#> || Process BAM file Adnp_wt_ChIP_r3_chr11.bam... ||
#> || Single-end reads are included. ||
#> || Total alignments : 1899151 ||
#> || Successfully assigned alignments : 201428 (10.6%) ||
#> || Running time : 0.01 minutes ||
#> || ||
#> || Process BAM file Adnp_wt_Input_r1_chr11.bam... ||
#> || Single-end reads are included. ||
#> || Total alignments : 1750427 ||
#> || Successfully assigned alignments : 151962 (8.7%) ||
#> || Running time : 0.01 minutes ||
#> || ||
#> || Process BAM file Adnp_wt_Input_r2_chr11.bam... ||
#> || Single-end reads are included. ||
#> || Total alignments : 1570400 ||
#> || Successfully assigned alignments : 133980 (8.5%) ||
#> || Running time : 0.01 minutes ||
#> || ||
#> || Process BAM file Adnp_wt_Input_r3_chr11.bam... ||
#> || Single-end reads are included. ||
#> || Total alignments : 1576344 ||
#> || Successfully assigned alignments : 127958 (8.1%) ||
#> || Running time : 0.01 minutes ||
#> || ||
#> || Write the final count table. ||
#> || Write the read assignment summary. ||
#> || ||
#> \\============================================================================//This will return the counts object, a list of length 8 (since we are using 8 bam files), where each element of the list contains a matrix of cpm values across all windows for all peaks. If you want to also save a seperately clustered heatmap for each bam file to a file, set plotHM=TRUE.
To plot heatmaps for each sample (bam file) sorted by the first bam file, you can use the DrawSummitHeatmaps function. By providing shorter names for the samples you can ensure nice heatmap titles. Here we select only the ChIP samples (and not the Input samples).
ht_list <- DrawSummitHeatmaps(counts[c(1,2,3)], bamNames[c(1,2,3)],orderSample=1, use.log=TRUE,
bottomCpm=c(0,0,0), medianCpm = c(2,2,2), topCpm = c(4,4,4), orderWindows=2, TargetHeight=500)
draw(ht_list, padding = unit(c(3, 8, 8, 2), "mm"))First we will calculate the mean counts per million (cpm) values for the three replicates of Adnp Chip and Input using the SummarizeHeatmaps function.
inx.ChIP <- grep("ChIP",names(counts))
Adnp <- grep("Adnp",names(counts[inx.ChIP]),value=TRUE)
Input <- grep("Input",names(counts),value=TRUE)[1:3]
sampleList <- list(Adnp=Adnp,Input=Input)
meanCounts <- SummarizeHeatmaps(counts,sampleList)
#> [1] "summarizing group Adnp"
#> [1] "summarizing group Input"Then we can split the epaks into those that overlap a promoter and those that do not, and generate the cumululative plots using the CumulativePlots function.
#select peaks that overlap a TSS (+/- 1kb)
#txdb=loadDb("/tungstenfs/scratch/gbuehler/michi/Annotations/GENCODE/Mouse/release_M23/gencode.vM23.annotation.txdb.sqlite")
txdb=TxDb.Mmusculus.UCSC.mm10.ensGene
TSSs <- promoters(txdb,upstream=1000,downstream=1000)
#> Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 1 out-of-bound range located on sequence
#> chr4_JH584295_random. Note that ranges located on a sequence whose
#> length is unknown (NA) or on a circular sequence are not considered
#> out-of-bound (use seqlengths() and isCircular() to get the lengths and
#> circularity flags of the underlying sequences). You can use trim() to
#> trim these ranges. See ?`trim,GenomicRanges-method` for more
#> information.
summit_TSS_overlap <- overlapsAny(summits,TSSs)
summit_TSS_overlap_names <- names(summits[summit_TSS_overlap == TRUE])
#cumulative plots
CumulativePlots(meanCounts,bamNames = names(meanCounts),
span=3025,step=50,summarizing = "mean",overlapNames = summit_TSS_overlap_names)This suggests that some of the Adnp peaks overlap promoters, and that they have as much enrichment in Adnp as the ones which do not overlap promoters. To see which ones overlap promoters, we can add the promoter annotation to the heatmaps, using the AnnotationHeatmap function.
promoters <- promoters(txdb,upstream=150,downstream=150)
annoname <- "promoters"
promoters.overlap <- AnnotationHeatmap(summits,annotation=promoters,annoname,span,step)
annos <- list(promoters.overlap)
names(annos) <- c("promoters")
allCounts <- c(meanCounts,annos)
cols <- c("darkblue","darkred","darkgreen")
upper.cpm <- c(rep(2,2),rep(1,1))
splitHM <- ifelse(summit_TSS_overlap==TRUE,"TSS","no TSS")
heatmap_list <- DrawSummitHeatmaps(allCounts, names(allCounts), plotcol= cols, medianCpm = upper.cpm, orderSample = 1, use.log=TRUE, summarizing = "mean", orderWindows=2,MetaScale=c("all","all","individual"), TargetHeight=500,
splitHM=splitHM)
draw(heatmap_list, padding = unit(c(3, 8, 8, 2), "mm"),show_heatmap_legend=FALSE)#get bigwig file names
all.bwFiles <- list.files("/tungstenfs/scratch/gbuehler/deepSeqRepos/bigwig/", full.names=TRUE,pattern="*bw$")
all.bwFiles <- grep("uni",all.bwFiles,value=TRUE)
bwFiles <- grep("872F",all.bwFiles,value=TRUE)[1:2]
#bwFiles <- list.files(system.file("extdata", package = "MiniChip"), full.names=TRUE,pattern="*bw$")
#plot
plotTracks(bwFilesPlus=bwFiles,bwNames=c(rep("Adnp",2)),txdb=txdb,EnsDb=EnsDb.Mmusculus.v79,
plotregion=peaks[peaks$score==max(peaks$score)],plotExtension=5000,plotranges=rep(0.8,2))